notes taken by Anne during R-meetup with Suny, about data preparation/visualisation for the eurostat refugee data.

aim:

push the data around so that it’s easy to plot rejected vs. total number of applications per country of origin & destination.

current status:

install package: set eval= TRUE if you need to do this

1: install the devtools package

# Install devtools only when it is not available yet, so this step can be
# re-run without triggering a reinstall.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}

2: install eurostatasyl package from github:

# Install straight from the GitHub repository "alrutten/r-eurostat-refugees".
devtools::install_github("alrutten/r-eurostat-refugees")

2a: if you run into an error about the SSL CA cert, set your curl options to ignore this:

# WARNING: this changes httr's global configuration so that SSL peer
# certificates are no longer verified. Use it only as a workaround for the
# CA cert error during installation, and call httr::reset_config() afterwards
# to restore the default settings.
# The calls are namespace-qualified, so httr does not have to be attached and
# a missing httr installation fails with a clear error (require() would only
# return FALSE and let the next line fail).
httr::set_config(httr::config(ssl_verifypeer = 0L))

3: load the package. Hmm, its dependencies don’t get attached automatically, so they are loaded explicitly as well:

require(eurostatasyl)
require(dplyr)
require(tidyr)
require(eurostat)

work with the data

1: download the data from eurostat, and load into a dataframe. Change data directory first!

# load_data_acceptance_api() # downloads SOMETHING. But what is it?
# d <- load_data_acceptance_file("/tmp/RtmpvjmLel/eurostat/migr_asydcfstq_date_code_TF.rds")
# d2 <- load_data_acceptance_file("/home/anne/personal/eurostat/r-eurostat-refugees/data/data_acceptance")
# Labeling is not working, so the saved data is read directly instead.

# Directory that holds the saved data; change this to your own location.
data_dir <- "/home/anne/personal/eurostat/r-eurostat-refugees/data"
data_file <- file.path(data_dir, "data_acceptance")

# Fail with a readable message instead of readRDS()'s low-level
# "cannot open file" error when the directory has not been adjusted.
if (!file.exists(data_file)) {
  stop("data file not found: ", data_file,
       " (change data_dir first)", call. = FALSE)
}
d <- readRDS(data_file)

2: generate citizen-geo-year_-NRejected-NTotal dataframe (with geo- and citizenTotals too)

# Builds one row per origin (citizen) x destination (geo) x year, with one
# column per decision level (Rejected, Total, Total_positive_decisions), plus
# the per-origin and per-destination totals and the origin split flag.
d_perYearCitizenGeoDecision <- d %>%
  # keep the "Total" rows for sex and age, the three decision levels of
  # interest, and drop the aggregate rows for geo (destination) and
  # citizen (origin)
  filter(age == "Total",
         sex == "Total",
         decision %in% c("Rejected", "Total_positive_decisions", "Total"),
         citizen != "Total",
         geo != "Total",
         !grepl("(EU)|(European)", citizen)) %>%   # rejected based on Dublin?
  # extract year_ (first four characters of time)
  mutate(year_ = substr(time, 1, 4)) %>%
  # totals per country of origin; the "Total" decision rows are left out so
  # that applications are not counted twice. na.rm = TRUE keeps one missing
  # count from turning the whole country total (and, through the median
  # below, the whole split) into NA.
  group_by(citizen) %>%
  mutate(originTotals = sum(values[decision != "Total"], na.rm = TRUE)) %>%
  ungroup() %>%
  # splitting variable: 1 = at or below the median origin total, 0 = above.
  # The median is taken over one value per country of origin, so a country
  # with more rows does not pull the cut-off towards itself.
  mutate(origin_split = as.numeric(
    originTotals <= median(originTotals[!duplicated(citizen)])
  )) %>%
  # totals per country of destination (same NA handling as above)
  group_by(geo) %>%
  mutate(destinationTotals = sum(values[decision != "Total"], na.rm = TRUE)) %>%
  # totals per origin-destination-year-decision; a missing count stays NA
  # here on purpose, so "no data" is not shown as zero
  group_by(citizen, geo, year_, decision,
           origin_split, originTotals, destinationTotals) %>%
  summarise(totalCount = sum(values)) %>%
  # drop the grouping left behind by summarise() before reshaping, so the
  # spread key is not a grouping column and the result is a plain table
  ungroup() %>%
  # get each decision level in its own column
  spread(decision, totalCount)

plotting: overview

1: make overview plotlist (alternatively, you can use facet_wrap(), but that’s harder to control). Note that I didn’t log-transform the totals.

# One dot plot per (year_, origin_split) group, stored in the list-column
# `dotplot`: origin on the x axis, destination on the y axis, point size =
# Total, point colour = share of rejected decisions.
dotplotlist <- d_perYearCitizenGeoDecision %>%
  ungroup() %>%
  group_by(year_, origin_split) %>%
  do(dotplot = ggplot(., aes(substr(citizen, 1, 14), substr(geo, 1, 14),
                             size = Total, colour = Rejected / Total)) +
       geom_point() +
       scale_colour_gradient(low = "blue", high = "orange") +
       theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0)) +
       # year_ and origin_split are the grouping variables, so they are
       # constant within the group: take the first value only. Pasting the
       # full columns would build one title string per row.
       ggtitle(paste0("year: ", .$year_[1],
                      ifelse(.$origin_split[1] == 0,
                             " higher number of applications",
                             " lower number of applications"))) +
       labs(colour = "proportion rejected",
            x = "country of origin",
            y = "country of destination")
  )

2: plot all the plots

# Print every plot in the list-column. seq_len() gives an empty sequence when
# there are no rows, whereas 1:nrow() would iterate over 1 and 0.
for (i in seq_len(nrow(dotplotlist))) {
  print(dotplotlist$dotplot[[i]])
}

plotting: per country of origin (for instance). shh, it’s pie charts..

pie size corresponds to log(Total + 1).

# Country of origin to show in the per-destination pie charts.
focalOrigin <- "Eritrea"

# Rows for the chosen origin only, reshaped to long form: one row per
# decision (Rejected / Accepted) with its count in `value`.
selectedData <- d_perYearCitizenGeoDecision %>%
  filter(citizen == focalOrigin) %>%
  # accepted is derived as everything that was not rejected
  mutate(Accepted = Total - Rejected) %>%
  gather(key = decision, value = value, Rejected, Accepted)

# Pie charts per destination (facet columns) and year (facet rows) for the
# chosen country of origin. Each pie is a filled bar drawn in polar
# coordinates; the bar width, and with it the pie radius, is log(Total + 1).
ggplot(
  selectedData,
  aes(
    x = log(Total + 1) / 2,   # centre the bar so that it starts at zero
    y = value,
    fill = decision,
    width = log(Total + 1)
  )
) +
  geom_bar(stat = "identity", position = "fill") +
  facet_grid(year_ ~ geo, switch = "y") +
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("blue", "orange")) +
  ggtitle(paste("country of origin:", focalOrigin)) +
  theme_minimal() +
  # strip axis decoration; only the facet labels identify the pies
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    strip.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0),
    strip.text.y = element_text(angle = 180)
  )
## Warning: Removed 4 rows containing missing values (position_stack).
## Warning: Removed 222 rows containing missing values (geom_bar).